In [325]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
In [326]:
pd.set_option('display.max_columns',None)
data=pd.read_csv("Life Expectancy Data.csv")
In [327]:
data.head()
Out[327]:
Country Year Status Life expectancy Adult Mortality infant deaths Alcohol percentage expenditure Hepatitis B Measles BMI under-five deaths Polio Total expenditure Diphtheria HIV/AIDS GDP Population thinness 1-19 years thinness 5-9 years Income composition of resources Schooling
0 Afghanistan 2015 Developing 65.0 263.0 62 0.01 71.279624 65.0 1154 19.1 83 6.0 8.16 65.0 0.1 584.259210 33736494.0 17.2 17.3 0.479 10.1
1 Afghanistan 2014 Developing 59.9 271.0 64 0.01 73.523582 62.0 492 18.6 86 58.0 8.18 62.0 0.1 612.696514 327582.0 17.5 17.5 0.476 10.0
2 Afghanistan 2013 Developing 59.9 268.0 66 0.01 73.219243 64.0 430 18.1 89 62.0 8.13 64.0 0.1 631.744976 31731688.0 17.7 17.7 0.470 9.9
3 Afghanistan 2012 Developing 59.5 272.0 69 0.01 78.184215 67.0 2787 17.6 93 67.0 8.52 67.0 0.1 669.959000 3696958.0 17.9 18.0 0.463 9.8
4 Afghanistan 2011 Developing 59.2 275.0 71 0.01 7.097109 68.0 3013 17.2 97 68.0 7.87 68.0 0.1 63.537231 2978599.0 18.2 18.2 0.454 9.5
In [328]:
data.shape
Out[328]:
(2938, 22)
In [329]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio                            2919 non-null   float64
 13  Total expenditure                2712 non-null   float64
 14  Diphtheria                       2919 non-null   float64
 15   HIV/AIDS                        2938 non-null   float64
 16  GDP                              2490 non-null   float64
 17  Population                       2286 non-null   float64
 18   thinness  1-19 years            2904 non-null   float64
 19   thinness 5-9 years              2904 non-null   float64
 20  Income composition of resources  2771 non-null   float64
 21  Schooling                        2775 non-null   float64
dtypes: float64(16), int64(4), object(2)
memory usage: 505.1+ KB
In [330]:
data1=data.copy()
In [331]:
data1.shape
Out[331]:
(2938, 22)
In [332]:
data1.isnull().sum()
Out[332]:
Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64
In [333]:
data1.isnull().sum().sum()
Out[333]:
2563
In [334]:
data1['Country'].value_counts()
Out[334]:
Country
Afghanistan              16
Peru                     16
Nicaragua                16
Niger                    16
Nigeria                  16
                         ..
Niue                      1
San Marino                1
Nauru                     1
Saint Kitts and Nevis     1
Dominica                  1
Name: count, Length: 193, dtype: int64
In [335]:
data1.describe().T
Out[335]:
count mean std min 25% 50% 75% max
Year 2938.0 2.007519e+03 4.613841e+00 2000.00000 2004.000000 2.008000e+03 2.012000e+03 2.015000e+03
Life expectancy 2928.0 6.922493e+01 9.523867e+00 36.30000 63.100000 7.210000e+01 7.570000e+01 8.900000e+01
Adult Mortality 2928.0 1.647964e+02 1.242921e+02 1.00000 74.000000 1.440000e+02 2.280000e+02 7.230000e+02
infant deaths 2938.0 3.030395e+01 1.179265e+02 0.00000 0.000000 3.000000e+00 2.200000e+01 1.800000e+03
Alcohol 2744.0 4.602861e+00 4.052413e+00 0.01000 0.877500 3.755000e+00 7.702500e+00 1.787000e+01
percentage expenditure 2938.0 7.382513e+02 1.987915e+03 0.00000 4.685343 6.491291e+01 4.415341e+02 1.947991e+04
Hepatitis B 2385.0 8.094046e+01 2.507002e+01 1.00000 77.000000 9.200000e+01 9.700000e+01 9.900000e+01
Measles 2938.0 2.419592e+03 1.146727e+04 0.00000 0.000000 1.700000e+01 3.602500e+02 2.121830e+05
BMI 2904.0 3.832125e+01 2.004403e+01 1.00000 19.300000 4.350000e+01 5.620000e+01 8.730000e+01
under-five deaths 2938.0 4.203574e+01 1.604455e+02 0.00000 0.000000 4.000000e+00 2.800000e+01 2.500000e+03
Polio 2919.0 8.255019e+01 2.342805e+01 3.00000 78.000000 9.300000e+01 9.700000e+01 9.900000e+01
Total expenditure 2712.0 5.938190e+00 2.498320e+00 0.37000 4.260000 5.755000e+00 7.492500e+00 1.760000e+01
Diphtheria 2919.0 8.232408e+01 2.371691e+01 2.00000 78.000000 9.300000e+01 9.700000e+01 9.900000e+01
HIV/AIDS 2938.0 1.742103e+00 5.077785e+00 0.10000 0.100000 1.000000e-01 8.000000e-01 5.060000e+01
GDP 2490.0 7.483158e+03 1.427017e+04 1.68135 463.935626 1.766948e+03 5.910806e+03 1.191727e+05
Population 2286.0 1.275338e+07 6.101210e+07 34.00000 195793.250000 1.386542e+06 7.420359e+06 1.293859e+09
thinness 1-19 years 2904.0 4.839704e+00 4.420195e+00 0.10000 1.600000 3.300000e+00 7.200000e+00 2.770000e+01
thinness 5-9 years 2904.0 4.870317e+00 4.508882e+00 0.10000 1.500000 3.300000e+00 7.200000e+00 2.860000e+01
Income composition of resources 2771.0 6.275511e-01 2.109036e-01 0.00000 0.493000 6.770000e-01 7.790000e-01 9.480000e-01
Schooling 2775.0 1.199279e+01 3.358920e+00 0.00000 10.100000 1.230000e+01 1.430000e+01 2.070000e+01
In [336]:
data1.describe().sum()
Out[336]:
Year                               1.498913e+04
Life expectancy                    3.342949e+03
Adult Mortality                    4.387089e+03
infant deaths                      4.911230e+03
Alcohol                            2.782870e+03
percentage expenditure             2.565521e+04
Hepatitis B                        2.857010e+03
Measles                            2.293851e+05
 BMI                               3.169665e+03
under-five deaths                  5.672481e+03
Polio                              3.394978e+03
Total expenditure                  2.755914e+03
Diphtheria                         3.394041e+03
 HIV/AIDS                          2.996520e+03
GDP                                1.515594e+05
Population                         1.376630e+09
 thinness  1-19 years              2.953160e+03
 thinness 5-9 years                2.954079e+03
Income composition of resources    2.774735e+03
Schooling                          2.847752e+03
dtype: float64
In [337]:
data1.duplicated().sum()
Out[337]:
0
In [338]:
for i in data.select_dtypes(include='number').columns:
    sns.histplot(data=data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [339]:
sns.pairplot(data)
Out[339]:
<seaborn.axisgrid.PairGrid at 0x1b7ca3b9d30>
No description has been provided for this image
In [340]:
for i in data.select_dtypes(include='number').columns:
    sns.boxplot(data=data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [341]:
data['Country'].unique()
Out[341]:
array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', "Côte d'Ivoire", 'Cabo Verde',
       'Cambodia', 'Cameroon', 'Canada', 'Central African Republic',
       'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo',
       'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus',
       'Czechia', "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon', 'Gambia',
       'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala',
       'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras',
       'Hungary', 'Iceland', 'India', 'Indonesia',
       'Iran (Islamic Republic of)', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati',
       'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic",
       'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Lithuania',
       'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',
       'Mali', 'Malta', 'Marshall Islands', 'Mauritania', 'Mauritius',
       'Mexico', 'Micronesia (Federated States of)', 'Monaco', 'Mongolia',
       'Montenegro', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia',
       'Nauru', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua',
       'Niger', 'Nigeria', 'Niue', 'Norway', 'Oman', 'Pakistan', 'Palau',
       'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Qatar', 'Republic of Korea',
       'Republic of Moldova', 'Romania', 'Russian Federation', 'Rwanda',
       'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Samoa', 'San Marino',
       'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia',
       'Seychelles', 'Sierra Leone', 'Singapore', 'Slovakia', 'Slovenia',
       'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan',
       'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'Swaziland', 'Sweden',
       'Switzerland', 'Syrian Arab Republic', 'Tajikistan', 'Thailand',
       'The former Yugoslav republic of Macedonia', 'Timor-Leste', 'Togo',
       'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey',
       'Turkmenistan', 'Tuvalu', 'Uganda', 'Ukraine',
       'United Arab Emirates',
       'United Kingdom of Great Britain and Northern Ireland',
       'United Republic of Tanzania', 'United States of America',
       'Uruguay', 'Uzbekistan', 'Vanuatu',
       'Venezuela (Bolivarian Republic of)', 'Viet Nam', 'Yemen',
       'Zambia', 'Zimbabwe'], dtype=object)
In [342]:
for i in [ 'Adult Mortality', 'Alcohol',
       'percentage expenditure', 'Hepatitis B', ' BMI ', 'Polio',
       'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling']:
    data[i].fillna(data[i].mean(),inplace=True)
C:\Users\rithe\AppData\Local\Temp\ipykernel_13764\272504829.py:6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[i].fillna(data[i].mean(),inplace=True)
In [343]:
data.isnull().sum()
Out[343]:
Country                             0
Year                                0
Status                              0
Life expectancy                    10
Adult Mortality                     0
infant deaths                       0
Alcohol                             0
percentage expenditure              0
Hepatitis B                         0
Measles                             0
 BMI                                0
under-five deaths                   0
Polio                               0
Total expenditure                   0
Diphtheria                          0
 HIV/AIDS                           0
GDP                                 0
Population                          0
 thinness  1-19 years               0
 thinness 5-9 years                 0
Income composition of resources     0
Schooling                           0
dtype: int64
In [344]:
for i in data.select_dtypes(include='number').columns:
    sns.boxplot(data=data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [345]:
s1=data.select_dtypes(include='number').corr()
In [346]:
plt.figure(figsize=(10,12))
sns.heatmap(s1,annot=True)
Out[346]:
<Axes: >
No description has been provided for this image
In [347]:
plt.pie(satus1,labels=satus1.index,autopct='%1.1f%%',startangle=90)
plt.show()
No description has been provided for this image

remove the column spaces

In [348]:
data.columns = data.columns.str.strip()
In [349]:
data.columns
Out[349]:
Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')
In [350]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2938 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2938 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2938 non-null   float64
 9   Measles                          2938 non-null   int64  
 10  BMI                              2938 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio                            2938 non-null   float64
 13  Total expenditure                2938 non-null   float64
 14  Diphtheria                       2938 non-null   float64
 15  HIV/AIDS                         2938 non-null   float64
 16  GDP                              2938 non-null   float64
 17  Population                       2938 non-null   float64
 18  thinness  1-19 years             2938 non-null   float64
 19  thinness 5-9 years               2938 non-null   float64
 20  Income composition of resources  2938 non-null   float64
 21  Schooling                        2938 non-null   float64
dtypes: float64(16), int64(4), object(2)
memory usage: 505.1+ KB

Now we are doing Label encoding for the Categorical Data

In [351]:
from sklearn.preprocessing import LabelEncoder
In [352]:
le = LabelEncoder()
data['Country_encoded'] = le.fit_transform(data['Country'])
In [353]:
data[['Country', 'Country_encoded']]
Out[353]:
Country Country_encoded
0 Afghanistan 0
1 Afghanistan 0
2 Afghanistan 0
3 Afghanistan 0
4 Afghanistan 0
... ... ...
2933 Zimbabwe 192
2934 Zimbabwe 192
2935 Zimbabwe 192
2936 Zimbabwe 192
2937 Zimbabwe 192

2938 rows × 2 columns

In [354]:
le = LabelEncoder()
data['Status_encoded'] = le.fit_transform(data['Status']) 
In [355]:
data[['Status', 'Status_encoded']]
Out[355]:
Status Status_encoded
0 Developing 1
1 Developing 1
2 Developing 1
3 Developing 1
4 Developing 1
... ... ...
2933 Developing 1
2934 Developing 1
2935 Developing 1
2936 Developing 1
2937 Developing 1

2938 rows × 2 columns

In [356]:
data.head(20)
Out[356]:
Country Year Status Life expectancy Adult Mortality infant deaths Alcohol percentage expenditure Hepatitis B Measles BMI under-five deaths Polio Total expenditure Diphtheria HIV/AIDS GDP Population thinness 1-19 years thinness 5-9 years Income composition of resources Schooling Country_encoded Status_encoded
0 Afghanistan 2015 Developing 65.0 263.0 62 0.01 71.279624 65.0 1154 19.1 83 6.0 8.16 65.0 0.1 584.259210 33736494.0 17.2 17.3 0.479 10.1 0 1
1 Afghanistan 2014 Developing 59.9 271.0 64 0.01 73.523582 62.0 492 18.6 86 58.0 8.18 62.0 0.1 612.696514 327582.0 17.5 17.5 0.476 10.0 0 1
2 Afghanistan 2013 Developing 59.9 268.0 66 0.01 73.219243 64.0 430 18.1 89 62.0 8.13 64.0 0.1 631.744976 31731688.0 17.7 17.7 0.470 9.9 0 1
3 Afghanistan 2012 Developing 59.5 272.0 69 0.01 78.184215 67.0 2787 17.6 93 67.0 8.52 67.0 0.1 669.959000 3696958.0 17.9 18.0 0.463 9.8 0 1
4 Afghanistan 2011 Developing 59.2 275.0 71 0.01 7.097109 68.0 3013 17.2 97 68.0 7.87 68.0 0.1 63.537231 2978599.0 18.2 18.2 0.454 9.5 0 1
5 Afghanistan 2010 Developing 58.8 279.0 74 0.01 79.679367 66.0 1989 16.7 102 66.0 9.20 66.0 0.1 553.328940 2883167.0 18.4 18.4 0.448 9.2 0 1
6 Afghanistan 2009 Developing 58.6 281.0 77 0.01 56.762217 63.0 2861 16.2 106 63.0 9.42 63.0 0.1 445.893298 284331.0 18.6 18.7 0.434 8.9 0 1
7 Afghanistan 2008 Developing 58.1 287.0 80 0.03 25.873925 64.0 1599 15.7 110 64.0 8.33 64.0 0.1 373.361116 2729431.0 18.8 18.9 0.433 8.7 0 1
8 Afghanistan 2007 Developing 57.5 295.0 82 0.02 10.910156 63.0 1141 15.2 113 63.0 6.73 63.0 0.1 369.835796 26616792.0 19.0 19.1 0.415 8.4 0 1
9 Afghanistan 2006 Developing 57.3 295.0 84 0.03 17.171518 64.0 1990 14.7 116 58.0 7.43 58.0 0.1 272.563770 2589345.0 19.2 19.3 0.405 8.1 0 1
10 Afghanistan 2005 Developing 57.3 291.0 85 0.02 1.388648 66.0 1296 14.2 118 58.0 8.70 58.0 0.1 25.294130 257798.0 19.3 19.5 0.396 7.9 0 1
11 Afghanistan 2004 Developing 57.0 293.0 87 0.02 15.296066 67.0 466 13.8 120 5.0 8.79 5.0 0.1 219.141353 24118979.0 19.5 19.7 0.381 6.8 0 1
12 Afghanistan 2003 Developing 56.7 295.0 87 0.01 11.089053 65.0 798 13.4 122 41.0 8.82 41.0 0.1 198.728544 2364851.0 19.7 19.9 0.373 6.5 0 1
13 Afghanistan 2002 Developing 56.2 3.0 88 0.01 16.887351 64.0 2486 13.0 122 36.0 7.76 36.0 0.1 187.845950 21979923.0 19.9 2.2 0.341 6.2 0 1
14 Afghanistan 2001 Developing 55.3 316.0 88 0.01 10.574728 63.0 8762 12.6 122 35.0 7.80 33.0 0.1 117.496980 2966463.0 2.1 2.4 0.340 5.9 0 1
15 Afghanistan 2000 Developing 54.8 321.0 88 0.01 10.424960 62.0 6532 12.2 122 24.0 8.20 24.0 0.1 114.560000 293756.0 2.3 2.5 0.338 5.5 0 1
16 Albania 2015 Developing 77.8 74.0 0 4.60 364.975229 99.0 0 58.0 0 99.0 6.00 99.0 0.1 3954.227830 28873.0 1.2 1.3 0.762 14.2 1 1
17 Albania 2014 Developing 77.5 8.0 0 4.51 428.749067 98.0 0 57.2 1 98.0 5.88 98.0 0.1 4575.763787 288914.0 1.2 1.3 0.761 14.2 1 1
18 Albania 2013 Developing 77.2 84.0 0 4.76 430.876979 99.0 0 56.5 1 99.0 5.66 99.0 0.1 4414.723140 289592.0 1.3 1.4 0.759 14.2 1 1
19 Albania 2012 Developing 76.9 86.0 0 5.14 412.443356 99.0 9 55.8 1 99.0 5.59 99.0 0.1 4247.614380 2941.0 1.3 1.4 0.752 14.2 1 1

Now we will Drop the columns which are not required

In [357]:
data3=data.drop(['Adult Mortality','Status','infant deaths','percentage expenditure','Measles','under-five deaths','GDP','Country','Income composition of resources','thinness 5-9 years','Schooling'] ,axis=1)
In [358]:
data3.head(20)
Out[358]:
Year Life expectancy Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 2015 65.0 0.01 65.0 19.1 6.0 8.16 65.0 0.1 33736494.0 17.2 0 1
1 2014 59.9 0.01 62.0 18.6 58.0 8.18 62.0 0.1 327582.0 17.5 0 1
2 2013 59.9 0.01 64.0 18.1 62.0 8.13 64.0 0.1 31731688.0 17.7 0 1
3 2012 59.5 0.01 67.0 17.6 67.0 8.52 67.0 0.1 3696958.0 17.9 0 1
4 2011 59.2 0.01 68.0 17.2 68.0 7.87 68.0 0.1 2978599.0 18.2 0 1
5 2010 58.8 0.01 66.0 16.7 66.0 9.20 66.0 0.1 2883167.0 18.4 0 1
6 2009 58.6 0.01 63.0 16.2 63.0 9.42 63.0 0.1 284331.0 18.6 0 1
7 2008 58.1 0.03 64.0 15.7 64.0 8.33 64.0 0.1 2729431.0 18.8 0 1
8 2007 57.5 0.02 63.0 15.2 63.0 6.73 63.0 0.1 26616792.0 19.0 0 1
9 2006 57.3 0.03 64.0 14.7 58.0 7.43 58.0 0.1 2589345.0 19.2 0 1
10 2005 57.3 0.02 66.0 14.2 58.0 8.70 58.0 0.1 257798.0 19.3 0 1
11 2004 57.0 0.02 67.0 13.8 5.0 8.79 5.0 0.1 24118979.0 19.5 0 1
12 2003 56.7 0.01 65.0 13.4 41.0 8.82 41.0 0.1 2364851.0 19.7 0 1
13 2002 56.2 0.01 64.0 13.0 36.0 7.76 36.0 0.1 21979923.0 19.9 0 1
14 2001 55.3 0.01 63.0 12.6 35.0 7.80 33.0 0.1 2966463.0 2.1 0 1
15 2000 54.8 0.01 62.0 12.2 24.0 8.20 24.0 0.1 293756.0 2.3 0 1
16 2015 77.8 4.60 99.0 58.0 99.0 6.00 99.0 0.1 28873.0 1.2 1 1
17 2014 77.5 4.51 98.0 57.2 98.0 5.88 98.0 0.1 288914.0 1.2 1 1
18 2013 77.2 4.76 99.0 56.5 99.0 5.66 99.0 0.1 289592.0 1.3 1 1
19 2012 76.9 5.14 99.0 55.8 99.0 5.59 99.0 0.1 2941.0 1.3 1 1
In [359]:
data3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Year                  2938 non-null   int64  
 1   Life expectancy       2928 non-null   float64
 2   Alcohol               2938 non-null   float64
 3   Hepatitis B           2938 non-null   float64
 4   BMI                   2938 non-null   float64
 5   Polio                 2938 non-null   float64
 6   Total expenditure     2938 non-null   float64
 7   Diphtheria            2938 non-null   float64
 8   HIV/AIDS              2938 non-null   float64
 9   Population            2938 non-null   float64
 10  thinness  1-19 years  2938 non-null   float64
 11  Country_encoded       2938 non-null   int32  
 12  Status_encoded        2938 non-null   int32  
dtypes: float64(10), int32(2), int64(1)
memory usage: 275.6 KB

Converting the Outliers to upper quartile or lower quartile range

In [360]:
numeric_cols=data3.select_dtypes(include='number').columns
In [361]:
for col in numeric_cols:
    Q1 = data3[col].quantile(0.25)
    Q3 = data3[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    data3[col] = data3[col].clip(lower=lower_limit, upper=upper_limit)
In [362]:
data3.head()
Out[362]:
Year Life expectancy Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 2015 65.0 0.01 65.0 19.1 49.5 8.16 65.0 0.1 3.125506e+07 15.35 0 1
1 2014 59.9 0.01 62.0 18.6 58.0 8.18 62.0 0.1 3.275820e+05 15.35 0 1
2 2013 59.9 0.01 64.0 18.1 62.0 8.13 64.0 0.1 3.125506e+07 15.35 0 1
3 2012 59.5 0.01 67.0 17.6 67.0 8.52 67.0 0.1 3.696958e+06 15.35 0 1
4 2011 59.2 0.01 68.0 17.2 68.0 7.87 68.0 0.1 2.978599e+06 15.35 0 1
In [363]:
df=data3.copy()
In [364]:
df.head(20)
Out[364]:
Year Life expectancy Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 2015 65.0 0.01 65.0 19.1 49.5 8.16 65.0 0.1 3.125506e+07 15.35 0 1
1 2014 59.9 0.01 62.0 18.6 58.0 8.18 62.0 0.1 3.275820e+05 15.35 0 1
2 2013 59.9 0.01 64.0 18.1 62.0 8.13 64.0 0.1 3.125506e+07 15.35 0 1
3 2012 59.5 0.01 67.0 17.6 67.0 8.52 67.0 0.1 3.696958e+06 15.35 0 1
4 2011 59.2 0.01 68.0 17.2 68.0 7.87 68.0 0.1 2.978599e+06 15.35 0 1
5 2010 58.8 0.01 66.0 16.7 66.0 9.20 66.0 0.1 2.883167e+06 15.35 0 1
6 2009 58.6 0.01 63.0 16.2 63.0 9.42 63.0 0.1 2.843310e+05 15.35 0 1
7 2008 58.1 0.03 64.0 15.7 64.0 8.33 64.0 0.1 2.729431e+06 15.35 0 1
8 2007 57.5 0.02 63.0 15.2 63.0 6.73 63.0 0.1 2.661679e+07 15.35 0 1
9 2006 57.3 0.03 64.0 14.7 58.0 7.43 58.0 0.1 2.589345e+06 15.35 0 1
10 2005 57.3 0.02 66.0 14.2 58.0 8.70 58.0 0.1 2.577980e+05 15.35 0 1
11 2004 57.0 0.02 67.0 13.8 49.5 8.79 49.5 0.1 2.411898e+07 15.35 0 1
12 2003 56.7 0.01 65.0 13.4 49.5 8.82 49.5 0.1 2.364851e+06 15.35 0 1
13 2002 56.2 0.01 64.0 13.0 49.5 7.76 49.5 0.1 2.197992e+07 15.35 0 1
14 2001 55.3 0.01 63.0 12.6 49.5 7.80 49.5 0.1 2.966463e+06 2.10 0 1
15 2000 54.8 0.01 62.0 12.2 49.5 8.20 49.5 0.1 2.937560e+05 2.30 0 1
16 2015 77.8 4.60 99.0 58.0 99.0 6.00 99.0 0.1 2.887300e+04 1.20 1 1
17 2014 77.5 4.51 98.0 57.2 98.0 5.88 98.0 0.1 2.889140e+05 1.20 1 1
18 2013 77.2 4.76 99.0 56.5 99.0 5.66 99.0 0.1 2.895920e+05 1.30 1 1
19 2012 76.9 5.14 99.0 55.8 99.0 5.59 99.0 0.1 2.941000e+03 1.30 1 1
In [365]:
df1=data3.drop('Life expectancy',axis=1)
In [366]:
df1.head()
Out[366]:
Year Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 2015 0.01 65.0 19.1 49.5 8.16 65.0 0.1 3.125506e+07 15.35 0 1
1 2014 0.01 62.0 18.6 58.0 8.18 62.0 0.1 3.275820e+05 15.35 0 1
2 2013 0.01 64.0 18.1 62.0 8.13 64.0 0.1 3.125506e+07 15.35 0 1
3 2012 0.01 67.0 17.6 67.0 8.52 67.0 0.1 3.696958e+06 15.35 0 1
4 2011 0.01 68.0 17.2 68.0 7.87 68.0 0.1 2.978599e+06 15.35 0 1
In [367]:
for i in df1.select_dtypes(include='number').columns:
    sns.boxplot(data=df1,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [368]:
for i in df1.select_dtypes(include='number').columns:
    sns.histplot(data=df1,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Standasation

In [369]:
from sklearn.preprocessing import StandardScaler
In [370]:
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data3)
standardized_data
Out[370]:
array([[ 1.6217623 , -0.44567214, -1.17336066, ...,  2.70888393,
        -1.69104231,  0.        ],
       [ 1.40498625, -0.98260157, -1.17336066, ...,  2.70888393,
        -1.69104231,  0.        ],
       [ 1.1882102 , -0.98260157, -1.17336066, ...,  2.70888393,
        -1.69104231,  0.        ],
       ...,
       [-1.19632639, -2.5723338 , -0.04402256, ..., -0.89203308,
         1.7231814 ,  0.        ],
       [-1.41310244, -2.51969366, -0.73644479, ..., -0.79024037,
         1.7231814 ,  0.        ],
       [-1.62987849, -2.44599746, -0.74666504, ...,  1.60188824,
         1.7231814 ,  0.        ]])
In [371]:
scaler = StandardScaler()
standardized_array = scaler.fit_transform(data3)
standardized_data = pd.DataFrame(standardized_array, columns=data3.columns, index=data3.index)
standardized_data
Out[371]:
Year Life expectancy Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 1.621762 -0.445672 -1.173361 -1.534064 -0.964715 -2.265630 0.985324 -1.282382 -0.629209 2.650045 2.708884 -1.691042 0.0
1 1.404986 -0.982602 -1.173361 -1.768413 -0.989810 -1.727535 0.994061 -1.472525 -0.629209 -0.826718 2.708884 -1.691042 0.0
2 1.188210 -0.982602 -1.173361 -1.612181 -1.014905 -1.474314 0.972218 -1.345763 -0.629209 2.650045 2.708884 -1.691042 0.0
3 0.971434 -1.024714 -1.173361 -1.377832 -1.040000 -1.157788 1.142595 -1.155619 -0.629209 -0.447944 2.708884 -1.691042 0.0
4 0.754658 -1.056298 -1.173361 -1.299715 -1.060076 -1.094482 0.858634 -1.092238 -0.629209 -0.528699 2.708884 -1.691042 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2933 -0.762774 -2.624974 -0.061908 -1.299715 -0.563194 -1.157788 0.535355 -1.282382 1.895041 0.572861 1.194717 1.723181 0.0
2934 -0.979550 -2.603918 -0.138560 -2.053448 -0.583270 -2.265630 0.268868 -1.092238 1.895041 0.556717 1.296510 1.723181 0.0
2935 -1.196326 -2.572334 -0.044023 -0.909134 -0.603346 -0.777956 0.273237 -0.902095 1.895041 -0.849432 -0.892033 1.723181 0.0
2936 -1.413102 -2.519694 -0.736445 -0.674785 -0.623422 -0.588040 0.111598 -0.648570 1.895041 0.526619 -0.790240 1.723181 0.0
2937 -1.629878 -2.445997 -0.746665 -0.440436 -0.643498 -0.461429 0.522249 -0.458427 1.895041 0.510441 1.601888 1.723181 0.0

2938 rows × 13 columns

In [372]:
for i in standardized_data.select_dtypes(include='number').columns:
    sns.histplot(data=standardized_data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [373]:
for i in standardized_data.select_dtypes(include='number').columns:
    sns.boxplot(data=standardized_data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Normalization

In [374]:
from sklearn.preprocessing import MinMaxScaler
In [375]:
scaler =MinMaxScaler()
normalized_data = scaler.fit_transform(data3)
normalized_data
Out[375]:
array([[1.        , 0.46428571, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.93333333, 0.35044643, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.86666667, 0.35044643, 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.13333333, 0.01339286, 0.26268479, ..., 0.07213115, 1.        ,
        0.        ],
       [0.06666667, 0.02455357, 0.10162692, ..., 0.09836066, 1.        ,
        0.        ],
       [0.        , 0.04017857, 0.09924968, ..., 0.7147541 , 1.        ,
        0.        ]])
In [376]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data3)
normalized_data = pd.DataFrame(normalized_data, columns=data3.columns, index=data3.index)
normalized_data
Out[376]:
Year Life expectancy Alcohol Hepatitis B BMI Polio Total expenditure Diphtheria HIV/AIDS Population thinness 1-19 years Country_encoded Status_encoded
0 1.000000 0.464286 0.000000 0.163568 0.209733 0.000000 0.683333 0.313131 0.0 1.000000 1.000000 0.0 0.0
1 0.933333 0.350446 0.000000 0.089765 0.203940 0.171717 0.685088 0.252525 0.0 0.010480 1.000000 0.0 0.0
2 0.866667 0.350446 0.000000 0.138967 0.198146 0.252525 0.680702 0.292929 0.0 1.000000 1.000000 0.0 0.0
3 0.800000 0.341518 0.000000 0.212770 0.192352 0.353535 0.714912 0.353535 0.0 0.118283 1.000000 0.0 0.0
4 0.733333 0.334821 0.000000 0.237371 0.187717 0.373737 0.657895 0.373737 0.0 0.095299 1.000000 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2933 0.266667 0.002232 0.258525 0.237371 0.302433 0.353535 0.592982 0.313131 1.0 0.408813 0.609836 1.0 0.0
2934 0.200000 0.006696 0.240695 0.000000 0.297798 0.000000 0.539474 0.373737 1.0 0.404219 0.636066 1.0 0.0
2935 0.133333 0.013393 0.262685 0.360375 0.293163 0.474747 0.540351 0.434343 1.0 0.004015 0.072131 1.0 0.0
2936 0.066667 0.024554 0.101627 0.434178 0.288528 0.535354 0.507895 0.515152 1.0 0.395653 0.098361 1.0 0.0
2937 0.000000 0.040179 0.099250 0.507981 0.283893 0.575758 0.590351 0.575758 1.0 0.391048 0.714754 1.0 0.0

2938 rows × 13 columns

In [377]:
for i in normalized_data.select_dtypes(include='number').columns:
    sns.boxplot(data=normalized_data,x=i)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [378]:
for i in normalized_data.select_dtypes(include='number').columns:
    sns.histplot(data=normalized_data,x=i ,kde=True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [379]:
c1=df1.select_dtypes(include='number').corr()
In [380]:
plt.figure(figsize=(10,15))
sns.heatmap(c1,annot=True)
Out[380]:
<Axes: >
No description has been provided for this image
In [381]:
df1=df1.drop(['Population','HIV/AIDS'],axis=1)
In [382]:
sns.pairplot(df1)
Out[382]:
<seaborn.axisgrid.PairGrid at 0x1b7e028c230>
No description has been provided for this image
In [383]:
sns.pairplot(df1,diag_kind='kde')
Out[383]:
<seaborn.axisgrid.PairGrid at 0x1b7e3cb1a60>
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: